Here, we compare the estimated counts with the ground truth.

library(here)
## here() starts at /Users/tomsmith/git_repos/bioinf_projects/2022/06_tRNA/trna_seq_quantification_benchmarking
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(camprotR)
## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (1.0.7)
## than is installed on your system (1.0.8.3). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at 
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
library(ggbeeswarm)
library(pheatmap)
library(RColorBrewer)


source('../R/plot_aes.R')

Define the infiles

infiles <- NULL
infiles[['isodecoder']] = Sys.glob(here('input/*final_results.dir/simulation_realistic*CompareTruthEstimateIsodecoder.tsv'))

infiles[['mimseq_isodecoder']] = Sys.glob(here('input/*final_results.dir/simulation_realistic*CompareTruthEstimateMimseqIsodecoder.tsv'))

infiles[['anticodon']] = Sys.glob(here('input/*final_results.dir/simulation_realistic*CompareTruthEstimateAnticodon.tsv'))

infiles[['individual']] = Sys.glob(here('input/*final_results.dir/simulation_realistic*CompareTruthEstimate.tsv'))

Read the infiles and normalise the estimated read counts (NumReads) and the ground truth (truth) by the total counts.

counts_vs_truth <- infiles %>% lapply(function(x)
  lapply(x, function(y){
    data <- read.delim(y, sep='\t') %>%
      group_by(input_file, simulation_n, quant_method, tally_method) %>%
      mutate(NumReads_norm=1E6*NumReads/(sum(NumReads)),
             truth_norm=1E6*truth/(sum(truth))) %>%
      mutate(trna_seq_method=sapply(strsplit(input_file, split='_'), '[[', 1)) %>%
      mutate(species=ifelse(trna_seq_method == 'quantMtRNAseq', 'Mus musculus', 'Homo sapiens')) %>%
      mutate(sample=gsub('mimtRNAseq_Hsap_', '', input_file))

    return(data)

  }) %>% bind_rows())

# some renaming is required for the Mt Leu and Ser genes
counts_vs_truth$anticodon <- counts_vs_truth$anticodon %>%
  mutate(Name=recode(Name,
                     'Homo_sapiens_MTtRNA-Leu-TAG'='Homo_sapiens_MTtRNA-Leu1-TAG',
                     'Homo_sapiens_MTtRNA-Leu-TAA'='Homo_sapiens_MTtRNA-Leu2-TAA',
                     'Homo_sapiens_MTtRNA-Ser-GCT'='Homo_sapiens_MTtRNA-Ser1-GCT',
                     'Homo_sapiens_MTtRNA-Ser-TGA'='Homo_sapiens_MTtRNA-Ser2-TGA'))

How many reads were used for the quantifications.

to_plot <- counts_vs_truth %>%
  bind_rows(.id='level') %>%
  filter(level %in% c('anticodon', 'mimseq_isodecoder')) %>%
  mutate(tally_method=lapply(tally_method, function(x) tally_methods_rename[[x]])) %>%
  mutate(tally_method=factor(tally_method, levels=tally_methods_rename)) %>%
  filter(!grepl('shrimp', quant_method)) %>%
  group_by(level, input_file, simulation_n, quant_method, tally_method, trna_seq_method, sample, species) %>%
  summarise(sum_truth=sum(truth), sum_numreads=sum(NumReads)) %>%
  mutate(fraction_assigned=sum_numreads/sum_truth) %>%
  rowwise() %>%
  mutate(level=rename_level(level)) %>%
  mutate(quant_method=factor(rename_quant(quant_method), levels=quant_rename)) %>%
  mutate(trna_seq_method=factor(rename_trna_method(trna_seq_method), levels=trna_method_rename))
## `summarise()` has grouped output by 'level', 'input_file', 'simulation_n',
## 'quant_method', 'tally_method', 'trna_seq_method', 'sample'. You can override
## using the `.groups` argument.
p <- to_plot %>%
  ggplot(aes(trna_seq_method, fraction_assigned, colour=tally_method)) +
  geom_boxplot(position='dodge') +
  theme_bw(base_size=12) +
  xlab('') +
  ylab('Reads Assigned') +
  facet_grid(level~., scales='free_x') +
  theme(strip.background=element_blank(),
        panel.border = element_blank(), 
        axis.line = element_line(colour = "black", size = 0.5),
        panel.grid=element_blank(),
        axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
  scale_colour_manual(values=get_cat_palette(7), name='Read tally method')
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p)

ggsave(here('results/plots/reads_assigned_anticodon_mimseqisodecoder.png'))
## Saving 7 x 5 in image
ggsave(here('results/plots/reads_assigned_anticodon_mimseqisodecoder.pdf'))
## Saving 7 x 5 in image

Repeat the plot at all levels of quantification

to_plot <- counts_vs_truth %>%
  bind_rows(.id='level') %>%
  mutate(tally_method=lapply(tally_method, function(x) tally_methods_rename[[x]])) %>%
  mutate(tally_method=factor(tally_method, levels=tally_methods_rename)) %>%
  filter(!grepl('shrimp', quant_method)) %>%
  group_by(level, input_file, simulation_n, quant_method, tally_method, trna_seq_method, sample, species) %>%
  summarise(sum_truth=sum(truth), sum_numreads=sum(NumReads)) %>%
  mutate(fraction_assigned=sum_numreads/sum_truth) %>%
  rowwise() %>%
  mutate(level=factor(rename_level(level), levels=levels_rename)) %>%
  mutate(quant_method=factor(rename_quant(quant_method), levels=quant_rename)) %>%
  mutate(trna_seq_method=factor(rename_trna_method(trna_seq_method), levels=trna_method_rename))
## `summarise()` has grouped output by 'level', 'input_file', 'simulation_n',
## 'quant_method', 'tally_method', 'trna_seq_method', 'sample'. You can override
## using the `.groups` argument.
p <- to_plot %>%
  ggplot(aes(trna_seq_method, fraction_assigned, colour=tally_method)) +
  geom_boxplot(position='dodge') +
  theme_bw(base_size=12) +
  xlab('') +
  ylab('Reads Assigned') +
  facet_grid(level~., scales='free_x') +
  theme(strip.background=element_blank(),
        panel.border = element_blank(), 
        axis.line = element_line(colour = "black", size = 0.5),
        panel.grid=element_blank(),
        axis.text.x=element_text(angle=45, vjust=1, hjust=1)) +
  scale_colour_manual(values=get_cat_palette(7), name='Read tally method')

print(p)

ggsave(here('results/plots/reads_assigned_all.png'))
## Saving 7 x 6 in image
ggsave(here('results/plots/reads_assigned_all.pdf'))
## Saving 7 x 6 in image

Define functions to calculate the MSE.

get_mse <- function(truth, estimate){
  mse <- mean((truth - estimate)^2)
  return(mse)
}

get_mse_normed <- function(truth, estimate){
  truth_norm = truth / sum(truth)
  estimate_norm = estimate / sum(estimate)
  
  mse <- mean((truth_norm - estimate_norm)^2)
  return(mse)
}

Calculate the MSE and correlation for each set of simulated samples for a tRNA gene for a given combination of the tRNA-seq method, quantification method.

inter_sample_metrics <- counts_vs_truth %>% lapply(function(x){
  
  x <- x %>% filter(!grepl('coli', Name)) %>% filter(!grepl('Und', Name))
  
  keep <- x %>% 
    group_by(trna_seq_method, quant_method, tally_method, simulation_n, Name) %>%
    summarise(n_values=sum(truth!=0)) %>%
    filter(n_values>0)
  
  x %>%
    merge(keep, by=c('trna_seq_method', 'quant_method', 'tally_method', 'simulation_n', 'Name')) %>%
    group_by(trna_seq_method, quant_method, tally_method, simulation_n, Name, species) %>%
    summarise(p_cor=cor(log(NumReads+1), log(truth+1)),
              s_cor=cor(log(NumReads+1), log(truth+1), method='spearman'),
              min_NumReads=min(NumReads),
              min_truth=min(truth),
              mse=get_mse(truth, NumReads),
              rmse=sqrt(mse),
              norm_mse=get_mse_normed(truth, NumReads),
              norm_rmse=sqrt(norm_mse))
})
## `summarise()` has grouped output by 'trna_seq_method', 'quant_method',
## 'tally_method', 'simulation_n'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'trna_seq_method', 'quant_method',
## 'tally_method', 'simulation_n', 'Name'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'trna_seq_method', 'quant_method',
## 'tally_method', 'simulation_n'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'trna_seq_method', 'quant_method',
## 'tally_method', 'simulation_n', 'Name'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'trna_seq_method', 'quant_method',
## 'tally_method', 'simulation_n'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'trna_seq_method', 'quant_method',
## 'tally_method', 'simulation_n', 'Name'. You can override using the `.groups`
## argument.
## `summarise()` has grouped output by 'trna_seq_method', 'quant_method',
## 'tally_method', 'simulation_n'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'trna_seq_method', 'quant_method',
## 'tally_method', 'simulation_n', 'Name'. You can override using the `.groups`
## argument.

Summarise the error metrics over all anticodons etc.

to_plot_summary <- lapply(names(inter_sample_metrics), function(name){
      data_to_use <- inter_sample_metrics[[name]] %>% filter(is.finite(p_cor), is.finite(rmse)) %>% ungroup() %>%
      mutate(tally_method=lapply(tally_method, function(x) tally_methods_rename[[x]])) %>%
      mutate(tally_method=factor(tally_method, levels=tally_methods_rename))

    n_methods <- data_to_use %>% select(quant_method, tally_method) %>% unique() %>% nrow()
    
    keep <- data_to_use %>%
          group_by(Name, simulation_n, trna_seq_method) %>%
          tally() %>%
          filter(n==n_methods)
  
    to_plot <- data_to_use %>%
      merge(keep, by=c('Name', 'trna_seq_method', 'simulation_n')) %>%
      mutate(level=rename_level(name)) %>%
      group_by(level, trna_seq_method, quant_method, tally_method, species) %>%
      summarise(mean_rmse=mean(norm_rmse),
                mean_cor=mean(p_cor))
}) %>% bind_rows() %>%
  rowwise() %>%
  mutate(quant_method=factor(rename_quant(quant_method), levels=quant_rename)) %>%
  mutate(trna_seq_method=factor(rename_trna_method(trna_seq_method), levels=trna_method_rename)) %>%
  ungroup() %>%
  mutate(level=factor(level, levels=unname(levels_rename)))
## `summarise()` has grouped output by 'level', 'trna_seq_method', 'quant_method',
## 'tally_method'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'level', 'trna_seq_method', 'quant_method',
## 'tally_method'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'level', 'trna_seq_method', 'quant_method',
## 'tally_method'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'level', 'trna_seq_method', 'quant_method',
## 'tally_method'. You can override using the `.groups` argument.

Identify the best/worst performing tally method and then plot heatmap with best/worst highlighted. Plot only Bowtie2/mimseq at anticodon/mimseq isodecoder level.

to_plot_summmary_finalised <- to_plot_summary %>%
  filter(quant_method!='SHRiMP') %>%
  filter(level %in% c('Anticodon', 'Mimseq isodecoder'))

highlight_best_cor <- to_plot_summmary_finalised %>%
  group_by(level, trna_seq_method) %>%
  slice_max(n=1, order_by=mean_cor) %>%
  mutate(label='\U002A')

highlight_best_rmse <- to_plot_summmary_finalised %>%
  group_by(level, trna_seq_method) %>%
  slice_min(n=1, order_by=mean_rmse) %>%
  mutate(label='\U002A')

highlight_worst_cor <- to_plot_summmary_finalised %>%
  group_by(level, trna_seq_method) %>%
  slice_min(n=1, order_by=mean_cor) %>%
  mutate(label='\U25BC')

highlight_worst_rmse <- to_plot_summmary_finalised %>%
  group_by(level, trna_seq_method) %>%
  slice_max(n=1, order_by=mean_rmse) %>%
  mutate(label='\U25BC')



p <- to_plot_summmary_finalised %>%
  ggplot(aes(trna_seq_method, tally_method)) +
  xlab('') +
  ylab('') +
  facet_grid(level~.) +
  theme_camprot(border=FALSE, base_family='sans', base_size=15, aspect_square=FALSE) +
    theme(strip.background=element_blank(),
        panel.grid=element_blank(),
        strip.text=element_text(size=15),
        axis.text.x=element_text(angle=45, vjust=1, hjust=1))

p1 <- p +
        geom_tile(aes(fill=mean_rmse)) +
        scale_fill_continuous(high='grey90', low=get_cat_palette(2)[2], name='Mean RMSE',
                              limits=c(0,0.2), breaks=seq(0,0.2,0.04)) +
        geom_text(aes(label=round(mean_rmse, 3))) +
        geom_text(data=highlight_best_rmse, aes(label=label), size=10, colour='black', vjust=0.25) +
        geom_text(data=highlight_worst_rmse, aes(label=label), size=5, colour='black', vjust=-0.5)

p2 <- p +
        geom_tile(aes(fill=mean_cor)) +
        scale_fill_continuous(low='grey90', high=get_cat_palette(3)[3], name='Mean\nPearson\nCorrelation',
                              limits=c(0.6,1), breaks=seq(0.6,1,0.1)) +
        geom_text(aes(label=round(mean_cor, 3))) +
        geom_text(data=highlight_best_cor, aes(label=label), size=10, colour='black', vjust=0.25) +
        geom_text(data=highlight_worst_cor, aes(label=label), size=5, colour='black', vjust=-0.5)

print(p1)

print(p2)

ggsave(here('results/plots/mean_rmse_bowtie2_mimseq.png'), plot=p1)
## Saving 8 x 9 in image
# ggsave doesn't work for some unicode characters. Using solution proposed here:
# https://stackoverflow.com/questions/44547350/corrupted-utf-characters-in-pdf-plots-generated-by-r/44548861#44548861
dev.off()
## null device 
##           1
quartz(type = 'pdf', file = here('results/plots/mean_rmse_bowtie2_mimseq.pdf'), height = 9, width=6)
print(p1)

ggsave(here('results/plots/mean_cor_bowtie2_mimseq.png'), plot=p2)
## Saving 6 x 9 in image
dev.off()
## null device 
##           1
quartz(type = 'pdf', file = here('results/plots/mean_cor_bowtie2_mimseq.pdf'), height = 9, width=6)
print(p2)

Repeat the plotting for all aligners and all levels of quantification.

highlight_best_cor_all <- to_plot_summary %>%
  group_by(level, trna_seq_method) %>%
  slice_max(n=1, order_by=mean_cor) %>%
  mutate(label='\u002A')

highlight_best_rmse_all <- to_plot_summary %>%
  group_by(level, trna_seq_method) %>%
  slice_min(n=1, order_by=mean_rmse) %>%
  mutate(label='\u002A')

highlight_worst_cor_all <- to_plot_summary %>%
  group_by(level, trna_seq_method) %>%
  slice_min(n=1, order_by=mean_cor) %>%
  mutate(label='\u25BC')

highlight_worst_rmse_all <- to_plot_summary %>%
  group_by(level, trna_seq_method) %>%
  slice_max(n=1, order_by=mean_rmse) %>%
  mutate(label='\u25BC')


p <- to_plot_summary %>%
  ggplot(aes(trna_seq_method, tally_method)) +
  xlab('') +
  ylab('') +
  facet_grid(level~species, scales='free') +
  theme_camprot(border=FALSE, base_family='sans', base_size=15, aspect_square=FALSE) +
    theme(strip.background=element_blank(),
        panel.grid=element_blank(),
        strip.text=element_text(size=15),
        axis.text.x=element_text(angle=45, vjust=1, hjust=1))

        
  
p <- to_plot_summary %>%
  ggplot(aes(trna_seq_method, tally_method)) +
  xlab('') +
  ylab('') +
  facet_grid(quant_method~level, scales='free', space='free') +
  theme_camprot(border=FALSE, base_family='sans', base_size=15, aspect_square=FALSE) +
    theme(strip.background=element_blank(),
        panel.grid=element_blank(),
        strip.text=element_text(size=15),
        panel.spacing.y=unit(5, 'mm'),
        axis.text.x=element_text(angle=45, hjust=1, vjust=1))

p1 <- p +
        geom_tile(aes(fill=mean_rmse)) +
        scale_fill_continuous(high='grey90', low=get_cat_palette(2)[2], name='Mean RMSE',
                              limits=c(0,0.2), breaks=seq(0,0.2,0.04)) +
        geom_text(aes(label=round(mean_rmse, 2))) +
        geom_tile(data=highlight_best_rmse_all, colour='black', fill=NA, size=1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p2 <- p +
        geom_tile(aes(fill=mean_cor)) +
        scale_fill_continuous(low='grey90', high=get_cat_palette(3)[3], name='Mean\nPearson\nCorrelation',
                      limits=c(0.6,1), breaks=seq(0.6,1,0.1)) +
        geom_text(aes(label=round(mean_cor, 2))) +
        geom_tile(data=highlight_best_cor_all, colour='black', fill=NA, size=1)

remove_clip <- function(p){
  pg <- ggplotGrob(p)

  for(i in which(grepl("strip", pg$layout$name))){
    pg$grobs[[i]]$layout$clip <- "off"
  }
    
  return(pg)
}
  

pg1 <- remove_clip(p1)
grid::grid.draw(pg1)  

png(here('results/plots/mean_rmse_all.png'), width=10, height=12, units='in', res=400)
grid::grid.draw(pg1)
dev.off()
## quartz_off_screen 
##                 2
pdf(here('results/plots/mean_rmse_all.pdf'), width=10, height=12)
grid::grid.draw(pg1)
dev.off()
## quartz_off_screen 
##                 2
pg2 <- remove_clip(p2)
grid::grid.draw(pg2)  

png(here('results/plots/mean_cor_all.png'), width=10, height=12, units='in', res=400)
grid::grid.draw(pg2)
dev.off()
## quartz_off_screen 
##                 2
pdf(here('results/plots/mean_cor_all.pdf'), width=10, height=12)
grid::grid.draw(pg2)
dev.off()
## quartz_off_screen 
##                 2
tally_method_levels <- c('random_single',
                         'fractional',
                         'no_multi',
                         'mapq10',
                         'decision',
                         'salmon',
                         'mimseq')

Summarise the mean error metrics for each anticodon

to_plot <- inter_sample_metrics$anticodon %>%
  mutate(Name=gsub('Homo_sapiens_mito_tRNA|Homo_sapiens_MTtRNA|Mus_musculus_mito_tRNA|Mus_musculus_MTtRNA', 'MT',
                   gsub('Homo_sapiens_tRNA-|Mus_musculus_tRNA-', '', Name))) %>%
  group_by(trna_seq_method, quant_method, tally_method, Name) %>%
  summarise(norm_rmse=mean(norm_rmse, na.rm=TRUE),
            p_cor=mean(p_cor, na.rm=TRUE)) %>%
  rowwise() %>%
  mutate(tally_method=rename_method(tally_method)) %>%
  ungroup()
## `summarise()` has grouped output by 'trna_seq_method', 'quant_method',
## 'tally_method'. You can override using the `.groups` argument.

Reformat to wider format for pheatmap. Using just YAMATseq and Bowtie2/mimseq.

heatmap_data_cor <- to_plot %>% filter(!grepl('MT', Name), quant_method!='shrimp', trna_seq_method=='YAMATseq') %>%
  pivot_wider(id_cols=Name, values_from=p_cor, names_from=tally_method) %>%
  tibble::column_to_rownames('Name')

Plot the heatmaps.

filename = here("results/plots/mimtRNAseq_p_cor_anticodon.png")
filename2 = here("results/plots/mimtRNAseq_p_cor_anticodon.pdf")


pheatmap(heatmap_data_cor, color=colorRampPalette(c(get_cat_palette(2)[2], 'grey95', get_cat_palette(1)))(50),
         breaks=seq(-0.5,1,length.out=50),
         annotation_colors=NULL, cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, clustering_method='average', fontsize_row=7,
         labels_col=colnames(heatmap_data_cor))

pheatmap(heatmap_data_cor, color=colorRampPalette(c(get_cat_palette(2)[2], 'grey95', get_cat_palette(1)))(50),
         breaks=seq(-0.5,1,length.out=50),
         annotation_colors=NULL, cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, clustering_method='average', fontsize_row=7,
         labels_col=colnames(heatmap_data_cor),
         filename=filename)

pheatmap(heatmap_data_cor, color=colorRampPalette(c(get_cat_palette(2)[2], 'grey95', get_cat_palette(1)))(50),
         breaks=seq(-0.5,1,length.out=50),
         annotation_colors=NULL, cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, clustering_method='average', fontsize_row=7,
         labels_col=colnames(heatmap_data_cor),
         filename=filename2)

Below, we pick the 6 anticodons with the biggest difference between bowtie2-decision and mimseq

Plot scatter plots of truth vs estimates for YAMATseq for the anticodons with the greatest difference between Decision and mimseq.

aoi <- heatmap_data_cor %>% mutate(diff=abs(Decision-Mimseq)) %>% arrange(desc(diff)) %>% head(6) 
aoi <- rownames(aoi)

tsm <- 'YAMATseq'

p <- counts_vs_truth$anticodon %>%
  filter(trna_seq_method==tsm, grepl(paste(aoi, collapse='|'), Name), quant_method!='shrimp') %>%
  filter(trna_seq_method=='YAMATseq') %>%
  filter(!grepl('MT', Name)) %>%
  mutate(Name=factor(gsub('Homo_sapiens_tRNA-|Mus_musculus_tRNA-', '', Name), levels=aoi))%>%
  mutate(tally_method=lapply(tally_method, function(x) tally_methods_rename[[x]])) %>%
  mutate(tally_method=factor(tally_method, levels=tally_methods_rename)) %>%
  ggplot(aes(log2(truth), log2(NumReads))) +
  geom_point(size=0.5, alpha=0.5) +
  #theme_camprot(base_size=12) +
  facet_grid(Name~tally_method) +
  theme(strip.background=element_blank()) +
  geom_abline(slope=1, linetype=2, colour='grey') +
  theme_bw(base_size=15, base_family='sans') +
  theme(strip.background=element_blank(), panel.grid=element_blank(),
        panel.border=element_blank(), aspect.ratio=1) +
  xlim(0, 20) +
  ylim(0, 20) +
  xlab('Truth (log2)') +
  ylab('Estimate (log2)')
  
print(p)
## Warning: Removed 4 rows containing missing values (`geom_point()`).
pg <- remove_clip(p)
## Warning: Removed 4 rows containing missing values (`geom_point()`).
grid::grid.draw(pg)  

png(here('results/plots/aoi_truth_vs_estimate_scatter.png'), width=8, height=8, units='in', res=400)
grid::grid.draw(pg)
dev.off()
## quartz_off_screen 
##                 2
pdf(here('results/plots/aoi_truth_vs_estimate_scatter.pdf'), width=8, height=8)
grid::grid.draw(pg)
dev.off()
## quartz_off_screen 
##                 2

Plot the correlation for YAMAT-Seq for the anticodons with the greatest difference between Decision and mimseq.

p <- inter_sample_metrics$anticodon %>% filter(grepl(paste(aoi, collapse='|'), Name), quant_method!='shrimp') %>%
  filter(!grepl('MT', Name)) %>%
  filter(trna_seq_method=='YAMATseq') %>%
  mutate(tally_method=lapply(tally_method, function(x) tally_methods_rename[[x]])) %>%
  mutate(tally_method=factor(tally_method, levels=tally_methods_rename)) %>%
  mutate(Name=gsub('Homo_sapiens_mito_tRNA|Homo_sapiens_MTtRNA|Mus_musculus_mito_tRNA|Mus_musculus_MTtRNA', 'MT',
                   gsub('Homo_sapiens_tRNA-|Mus_musculus_tRNA-', '', Name))) %>%
  mutate(Name=factor(Name, levels=aoi)) %>%
      ggplot(aes(Name, p_cor, colour=tally_method, group=tally_method)) +
    stat_summary(geom='point', fun='mean',  position=position_dodge(width=1)) +
    stat_summary(geom='errorbar',  position=position_dodge(width=1)) +
    facet_wrap(~Name, scales='free_x', nrow=2) +
    #theme_camprot(base_size=10) +
    scale_colour_manual(values=get_cat_palette(7), name='') +
    xlab('') +
  theme_camprot(base_family='sans', base_size=10, border=FALSE) +
  theme(strip.text=element_blank()) +
  ylim(NA, 1) +
  ylab('Pearson correlation')

print(p)
## Warning: Removed 2 rows containing non-finite values (`stat_summary()`).
## Removed 2 rows containing non-finite values (`stat_summary()`).
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`

ggsave(here('results/plots/p_cor_anticodon_highlight.png'), plot=p)
## Saving 4 x 4 in image
## Warning: Removed 2 rows containing non-finite values (`stat_summary()`).
## Removed 2 rows containing non-finite values (`stat_summary()`).
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
ggsave(here('results/plots/p_cor_anticodon_highlight.pdf'), plot=p)
## Saving 4 x 4 in image
## Warning: Removed 2 rows containing non-finite values (`stat_summary()`).
## Removed 2 rows containing non-finite values (`stat_summary()`).
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`

Read in the results from notebook 4 regarding the fraction of correct read assignments.

diff_quant_seq_methods <- readRDS('../results/diff_quant_seq_methods.rds')
diff_quant_seq_methods <- diff_quant_seq_methods %>% lapply(function(x) filter(x, is.finite(fraction)))

Correlate the fraction of read assignments vs the Pearson correlation coefficient.

to_plot_metrics_vs_correct_alignment_rate <- diff_quant_seq_methods %>% names() %>%
  lapply(function(level){
    inter_sample_metrics[[level]] %>%
      mutate(truth=gsub('Homo_sapiens_tRNA-|Homo_sapiens_tRX-|Mus_musculus_tRNA-|Mus_musculus_tRX-', '',
                        gsub('Homo_sapiens_MTtRNA|Mus_musculus_MTtRNA', 'MT', Name))) %>%
      group_by(truth, trna_seq_method, quant_method, tally_method) %>%
      summarise(mean_p_cor=mean(p_cor, na.rm=TRUE),
                mean_rmse=mean(norm_rmse, na.rm=TRUE)) %>%
      filter(is.finite(mean_p_cor)) %>%
      merge(diff_quant_seq_methods[[level]], by=c('truth', 'trna_seq_method', 'quant_method'), all.x=TRUE)
  })
## `summarise()` has grouped output by 'truth', 'trna_seq_method', 'quant_method'.
## You can override using the `.groups` argument.
## `summarise()` has grouped output by 'truth', 'trna_seq_method', 'quant_method'.
## You can override using the `.groups` argument.
## `summarise()` has grouped output by 'truth', 'trna_seq_method', 'quant_method'.
## You can override using the `.groups` argument.
## `summarise()` has grouped output by 'truth', 'trna_seq_method', 'quant_method'.
## You can override using the `.groups` argument.
names(to_plot_metrics_vs_correct_alignment_rate) <- names(diff_quant_seq_methods)

for(level in names(to_plot_metrics_vs_correct_alignment_rate)){
  p <- to_plot_metrics_vs_correct_alignment_rate[[level]] %>%
    mutate(tally_method=lapply(tally_method, function(x) tally_methods_rename[[x]])) %>%
    mutate(tally_method=factor(tally_method, levels=tally_methods_rename)) %>%
    mutate(trna_seq_method=unlist(lapply(trna_seq_method, rename_trna_method))) %>%
    filter(quant_method!='shrimp') %>%
    ggplot(aes(fraction, mean_p_cor)) +
    geom_point(size=0.25, alpha=0.25) +
    geom_smooth(method='lm', se=FALSE) +
    theme_camprot(base_family='sans', base_size = 10, border=FALSE) +
    scale_x_continuous(limits=c(0, 1), breaks=seq(0,1,0.2), name='Fraction correct read assignment') +
    scale_y_continuous(limits=c(NA, 1), name='Pearson correlation vs truth') +
    ggtitle(rename_level(level)) +
    theme(strip.background=element_blank(), panel.spacing=unit(1, "lines")) +
    facet_grid(trna_seq_method~tally_method) +
    theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1))

  pg <- remove_clip(p)

  print(p)
  plot(pg)

  png(here(sprintf('results/plots/read_assignment_vs_cor_%s.png', level)))
  grid::grid.draw(pg)
  dev.off()
  
  pdf(here(sprintf('results/plots/read_assignment_vs_cor_%s.pdf', level)))
  grid::grid.draw(pg)
  dev.off()

}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_smooth()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_smooth()`).

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1573 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1573 rows containing missing values (`geom_point()`).

## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1573 rows containing non-finite values (`stat_smooth()`).
## Removed 1573 rows containing missing values (`geom_point()`).

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] RColorBrewer_1.1-3  pheatmap_1.0.12     ggbeeswarm_0.7.2   
## [4] camprotR_0.0.0.9000 ggplot2_3.4.2       tidyr_1.2.0        
## [7] dplyr_1.0.8         here_1.0.1         
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-155          ProtGenerics_1.26.0   doParallel_1.0.17    
##  [4] rprojroot_2.0.2       MSnbase_2.20.4        tools_4.1.2          
##  [7] bslib_0.3.1           utf8_1.2.3            R6_2.5.1             
## [10] affyio_1.64.0         vipor_0.4.5           mgcv_1.8-39          
## [13] DBI_1.1.2             BiocGenerics_0.40.0   colorspace_2.1-0     
## [16] withr_2.5.0           tidyselect_1.1.2      compiler_4.1.2       
## [19] preprocessCore_1.56.0 textshaping_0.3.6     cli_3.6.1            
## [22] Biobase_2.54.0        labeling_0.4.2        sass_0.4.0           
## [25] scales_1.2.1          DEoptimR_1.0-10       robustbase_0.93-9    
## [28] affy_1.72.0           systemfonts_1.0.4     stringr_1.5.0        
## [31] digest_0.6.29         rmarkdown_2.12        pkgconfig_2.0.3      
## [34] htmltools_0.5.2       fastmap_1.1.0         limma_3.50.1         
## [37] highr_0.9             rlang_1.1.1           rstudioapi_0.13      
## [40] impute_1.68.0         jquerylib_0.1.4       generics_0.1.2       
## [43] farver_2.1.1          jsonlite_1.8.0        mzID_1.32.0          
## [46] BiocParallel_1.28.3   magrittr_2.0.3        Matrix_1.4-0         
## [49] MALDIquant_1.21       Rcpp_1.0.8.3          munsell_0.5.0        
## [52] S4Vectors_0.32.3      fansi_1.0.4           MsCoreUtils_1.6.2    
## [55] lifecycle_1.0.3       vsn_3.62.0            stringi_1.7.6        
## [58] yaml_2.3.5            MASS_7.3-55           zlibbioc_1.40.0      
## [61] plyr_1.8.6            grid_4.1.2            parallel_4.1.2       
## [64] crayon_1.5.0          lattice_0.20-45       splines_4.1.2        
## [67] mzR_2.28.0            knitr_1.37            pillar_1.9.0         
## [70] codetools_0.2-18      stats4_4.1.2          XML_3.99-0.9         
## [73] glue_1.6.2            evaluate_0.15         pcaMethods_1.86.0    
## [76] BiocManager_1.30.16   vctrs_0.6.3           foreach_1.5.2        
## [79] gtable_0.3.3          purrr_0.3.4           clue_0.3-60          
## [82] assertthat_0.2.1      xfun_0.30             ragg_1.2.5           
## [85] ncdf4_1.19            tibble_3.2.1          iterators_1.0.14     
## [88] beeswarm_0.4.0        IRanges_2.28.0        cluster_2.1.2        
## [91] ellipsis_0.3.2